{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import csv\n",
    "import time\n",
    "import jieba\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# This notebook relies on the TensorFlow 2.x API.\n",
    "assert tf.__version__.startswith('2.')\n",
    "# Expose only GPU 0 to TensorFlow.\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
    "gpus = tf.config.experimental.list_physical_devices('GPU')\n",
    "# Enable on-demand GPU memory allocation. Iterating instead of indexing\n",
    "# gpus[0] keeps the cell runnable when no GPU is visible (empty list).\n",
    "for gpu in gpus:\n",
    "    tf.config.experimental.set_memory_growth(gpu, True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pretrained word vectors; binary=False selects the text word2vec format.\n",
    "cn_model = KeyedVectors.load_word2vec_format('./sgns.zhihu.bigram', binary=False)#pretrained word vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.66128117\n",
      "[('硕士', 0.8219336271286011), ('本科', 0.7795450687408447), ('读研', 0.7171738147735596), ('PHD', 0.7077330350875854), ('本科生', 0.7003262042999268), ('大学本科', 0.698881208896637), ('博士生', 0.6916512250900269), ('博士', 0.6902437210083008), ('硕博', 0.6894769668579102), ('硕士生', 0.6863948106765747)]\n",
      "[('劈腿', 0.5849199295043945), ('婚外情', 0.5557921528816223), ('偷情', 0.5555664300918579), ('外遇', 0.5458645820617676), ('再婚', 0.5422405004501343), ('未婚先孕', 0.5357398986816406), ('隐婚', 0.5257365703582764), ('离婚', 0.524539053440094), ('马蓉', 0.5239365696907043), ('通奸', 0.5222055912017822)]\n",
      "老人\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ZZK\\Anaconda3\\envs\\learn\\lib\\site-packages\\gensim\\models\\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
      "  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n"
     ]
    }
   ],
   "source": [
    "# Quick sanity checks on the loaded word vectors.\n",
    "# Similarity score between two words.\n",
    "print(cn_model.similarity('橙子', '橘子'))\n",
    "# The 10 entries most similar to the given word.\n",
    "print(cn_model.most_similar(positive=['研究生'], topn=10))\n",
    "# Analogy-style query combining positive and negative example words.\n",
    "print(cn_model.most_similar(positive=['女人','出轨'], negative=['男人'], topn=10))\n",
    "# The word that does not match the rest of the list.\n",
    "print(cn_model.doesnt_match(['会计师','程序员','律师','医生','老人']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000\n",
      "平均句长: 18.1518 最长句长: 154\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de5gdVZnv8e+PBAiXQIA0CJ1Ag0QuOoDQahS8YBgPNw1nBMERCBiNjggIeDR4RY9zhNEBZHTQyMWAgGAEiYRBmAAHfbhIAyHhokMmQNIkJs0tBKJI8J0/am17Z2d3V3XS1Xt39+/zPPvZVatWrXp7d7LfrrWqVikiMDMz681GjQ7AzMyan5OFmZnlcrIwM7NcThZmZpbLycLMzHI5WZiZWS4nCytE0g8lfbWf2tpZ0suSRqT1OyV9oj/aTu39h6Qp/dVeH477LUnPSvpjP7T1Pkmd/RHXBsQQknZvwHEb/rPbupwsDElPSfqTpFWSXpR0t6RPS/rbv4+I+HRE/N+CbR3SW52IWBwRW0bE6/0Q+zmSflrT/mERMXND2+5jHOOBs4C9I+INdbb7C7AHjUpK1jdOFlbxwYgYDewCnAt8Ebi0vw8iaWR/t9kkdgGei4gVjQ7ErAxOFraWiFgZEbOBY4Epkt4CIOknkr6VlsdKuimdhTwv6TeSNpJ0JbAz8KvUzfQFSW3pL8epkhYDt1eVVSeON0r6naSVkm6UtG061jp/kVfOXiQdCnwJODYd7+G0/W/dWimur0h6WtIKSVdI2jptq8QxRdLi1IX05Z4+G0lbp/27UntfSe0fAtwG7JTi+EnNflsA/1G1/WVJO0naVNKFkpam14WSNu3h2KdJekzSuLR+pKR5VWeC+9R8Pp+XND99ntdKGtXb766XfxKVNjeV9N30OS1P3ZKbVf+OJJ2VPuNlkk6u2nc7Sb+S9JKk+1N33W/TtrtStYfT53Js1X5127PGcLKwuiLid0An8O46m89K21qAHci+sCMiTgAWk52lbBkR/1K1z3uBvYD/1cMhTwQ+DuwErAEuKhDjLcD/A65Nx9u3TrWT0utgYDdgS+D7NXUOAvYAJgFfk7RXD4f8N2Dr1M57U8wnR8R/AocBS1McJ9XE+UrN9i0jYinwZWAisB+wL/B24Cu1B1U2VnQS8N6I6JS0P3AZ8ClgO+BHwOyaRPMR4FBgV2CftD/08Lvr4eetdh7wphTr7kAr8LWq7W9In00rMBX4gaRt0rYfAK+kOlPSq/LZvCct7ps+l2sLtGcN4GRhvVkKbFun/DVgR2CXiHgtIn4T+ZOMnRMRr0TEn3rYfmVEPJK+WL8KfERpAHwDfQw4PyIWRcTLwNnAcTVnNd+IiD9FxMPAw2Rf3GtJsRwLnB0RqyLiKeBfgRM2MLZvRsSKiOgCvlHTniSdT5ZgD051AD4J/Cgi7ouI19P4zKtkiafioohYGhHPA78i+5KH9fjdSVI65hkR8XxErCJL0sdVVXst/SyvRcTNwMvAHulz+zDw9YhYHRGPAUXGk+q2V2A/K4mThfWmFXi+Tvl3gIXArZIWSZpeoK0lfdj+NLAxMLZQlL3bKbVX3fZIsr+qK6qvXlpNdvZRayywSZ22Wvs5tp2q1scA04BvR8TKqvJdgLNSV9KLkl4Extfs29PPtD6/uxZgc+CBquPdksornouINXWO2UL2eVf/fvP+LfTWnjWIk4XVJeltZF+Ev63dlv6yPisidgM+CJwpaVJlcw9N5p15jK9a3pnsL8tnybovNq+KawRrf0nltbuU7Mu1uu01wPKc/Wo9m2KqbeuZgvvXi7NebEur1l8AjgQul3RgVfkS4J8jYkzVa/OIuCY3iN5/dz15FvgT8Oaq420dEUW+vLvIPu9xVWXje6hrTczJwtYiaStJRwI/A34aEQvq1DlS0u6pe+Il4PX0guxLeLf1OPTxkvaWtDnwTWBWurT2
v4BRko6QtDFZn3513/xyoK2XQdprgDMk7SppS7rHONb0UL+uFMt1wD9LGi1pF+BM4Ke977lWnNtVBterYvuKpBZJY8nGAGovA76TrLvqBknvSMU/Bj4t6R3KbJE+n9F5QeT87uqKiL+mY14gafvUTquknsafqvd9HbgeOEfS5pL2JBvrqba+/2ZsADlZWMWvJK0i+6v1y8D5QE9XoEwA/pOsH/ke4N/TlxrAt8m+AF+U9Pk+HP9K4Cdk3SejgNMguzoL+AxwCdlf8a+QDdBW/Dy9PyfpwTrtXpbavgt4EvgzcGof4qp2ajr+IrIzrqtT+7ki4vdkyWFR+mx2Ar4FdADzgQXAg6msdt/byH4XsyUdEBEdZGMI3yc7+1hI9wB2nt5+d735YjrOvZJeSm0UHUP4LNlg9R/JfhfXkI2xVJwDzEyfy0cKtmkDTH74kZkNJEnnAW+IiAG/y97Wn88szKxUkvaUtE/qMns72aWwNzQ6LuuboXo3rZk1j9FkXU87ASvILjm+saERWZ+5G8rMzHK5G8rMzHIN6m6osWPHRltbW6PDMDMbVB544IFnI6Ilv2a3QZ0s2tra6OjoaHQYZmaDiqSn82utrdRuKElnSHpU0iOSrpE0Kt0cdZ+kJ9JsmJukupum9YVpe1uZsZmZWXGlJQtJrWQ3VrVHxFuAEWQTj50HXBARE8huKJqadpkKvBARuwMXpHpmZtYEyh7gHglslmb43BxYBrwfmJW2zwSOSsuT6Z6NchYwKU1JYGZmDVZasoiIZ4Dvkj3fYBmwEngAeLFqXp5OumftbCXNRpm2rySbq38tkqZJ6pDU0dXVVbvZzMxKUGY31DZkZwu7kt2MswXZA2BqVW70qHcWsc5NIBExIyLaI6K9paVPg/lmZraeyuyGOgR4MiK6IuI1spkn3wWMqXrwzDi6p2TuJE1dnLZvTf1nKZiZ2QArM1ksBiamaYlF9sjKx4A7gKNTnSl03/Y/m+7HLR4N3F7g6WtmZjYAyhyzuI9soPpBsumXNwJmkE11fKakhWRjEpemXS4lm+9/IdlzAoo8wcvMzAbAoJ4bqr29PXxTnplZ30h6ICLa+7KP54Ya4tqmz2l0CGY2BDhZmJlZLicLMzPL5WQxxLRNn+OuJzPrd04WZmaWy8nCzMxyOVmYmVkuJwszM8vlZGFmZrmcLMzMLJeThZmZ5XKyMDOzXE4WZmaWy8liGPHd3Wa2vpwszMwsl5OFmZnlcrIwM7NcpSULSXtImlf1eknS5yRtK+k2SU+k921SfUm6SNJCSfMl7V9WbGZm1jdlPoP7DxGxX0TsBxwArAZuIHu29tyImADMpftZ24cBE9JrGnBxWbGZmVnfDFQ31CTgvyPiaWAyMDOVzwSOSsuTgSsicy8wRtKOAxSfmZn1YqCSxXHANWl5h4hYBpDet0/lrcCSqn06U9laJE2T1CGpo6urq8SQzcysovRkIWkT4EPAz/Oq1imLdQoiZkREe0S0t7S09EeIZmaWYyDOLA4DHoyI5Wl9eaV7Kb2vSOWdwPiq/cYBSwcgPjMzyzEQyeKjdHdBAcwGpqTlKcCNVeUnpquiJgIrK91VZmbWWCPLbFzS5sDfA5+qKj4XuE7SVGAxcEwqvxk4HFhIduXUyWXGZmZmxZWaLCJiNbBdTdlzZFdH1dYN4JQy4zEzs/XjO7jNzCyXk4WZmeVysjAzs1xOFmZmlsvJwszMcjlZmJlZLicLMzPL5WRhZma5nCzMzCyXk4WZmeVysjAzs1xOFmZmlsvJwszMcjlZmJlZLieLQaxt+hzaps9pdBhmNgw4WZiZWS4ni2HKZyVm1hdOFmZmlqvUZCFpjKRZkn4v6XFJ75S0raTbJD2R3rdJdSXpIkkLJc2XtH+ZsZmZWXFln1l8D7glIvYE9gUeB6YDcyNiAjA3rQMcBkxIr2nAxSXHZmZmBZWWLCRtBbwHuBQgIv4SES8Ck4GZqdpM4Ki0PBm4IjL3AmMk7VhWfGZm
VlyZZxa7AV3A5ZIeknSJpC2AHSJiGUB63z7VbwWWVO3fmcrWImmapA5JHV1dXSWGb2ZmFWUmi5HA/sDFEfFW4BW6u5zqUZ2yWKcgYkZEtEdEe0tLS/9EamZmvSozWXQCnRFxX1qfRZY8lle6l9L7iqr646v2HwcsLTE+MzMrqLRkERF/BJZI2iMVTQIeA2YDU1LZFODGtDwbODFdFTURWFnprjIzs8YaWXL7pwJXSdoEWAScTJagrpM0FVgMHJPq3gwcDiwEVqe6ZmbWBEpNFhExD2ivs2lSnboBnFJmPGZmtn58B7eZmeVysjAzs1xOFmZmlis3WUg6MN1Mh6TjJZ0vaZfyQzMzs2ZR5MziYmC1pH2BLwBPA1eUGpWZmTWVIsliTbpSaTLwvYj4HjC63LDMzKyZFLl0dpWks4HjgfdIGgFsXG5YZmbWTIqcWRwLvApMTXdltwLfKTUqMzNrKrlnFilBnF+1vhiPWZiZDStFrob6h/RUu5WSXpK0StJLAxGcmZk1hyJjFv8CfDAiHi87GDMza05FxiyWO1GYmQ1vRc4sOiRdC/ySbKAbgIi4vrSozMysqRRJFluRTRn+gaqyAJwszMyGiSJXQ/m5EmZmw1yRq6HeJGmupEfS+j6SvlJ+aGZm1iyKDHD/GDgbeA0gIuYDx5UZlJmZNZciyWLziPhdTdmaMoIxM7PmVCRZPCvpjWSD2kg6GlhWpHFJT0laIGmepI5Utq2k29KNfrdJ2iaVS9JFkhZKmi9p//X8mczMrJ8VSRanAD8C9pT0DPA54J/6cIyDI2K/iKg8i3s6MDciJgBz0zrAYcCE9JpGNjW6DZC26XMaHYKZNbEiyeKZiDgEaAH2jIiDgA2Z7mMyMDMtzwSOqiq/IjL3AmMk7bgBxzEzs35SJFlcL2lkRLwSEaskvQG4rWD7Adwq6QFJ01LZDhGxDCC9b5/KW4ElVft2prK1SJomqUNSR1dXV8EwzMxsQxRJFr8EZkkaIakNuJXs6qgiDoyI/cm6mE6R9J5e6qpOWaxTEDEjItojor2lpaVgGINf2/Q57ioys4YpclPejyVtQpY02oBPRcTdRRqPiKXpfYWkG4C3A8sl7RgRy1I304pUvRMYX7X7OGBp4Z/EzMxK0+OZhaQzKy9gFNkX+TxgYirrlaQtJI2uLJNNF/IIMBuYkqpNAW5My7OBE9NVUROBlZXuKjMza6zezixqn7N9Qw/lPdkBuEFS5ThXR8Qtku4HrpM0FVgMHJPq3wwcDiwkm4vK04yYmTWJHpNFRHyjej2dJUREvFyk4YhYBOxbp/w5YFKd8iC7TNfMzJpMkbmh3iLpIbIupEfTlU1vLj80MzNrFkWuhpoBnBkRu0TELsBZZPNFmZnZMFEkWWwREXdUViLiTmCL0iIyM7OmU+ThR4skfRW4Mq0fDzxZXkhmZtZsipxZfJxsqo/r02sscFKJMZmZWZMpcmZxSEScVl0g6Rjg5+WEZGZmzabImUW9qT2KTvdhZmZDQI9nFpIOI7tJrlXSRVWbtsIPPzIzG1Z664ZaCnQAHwIeqCpfBZxRZlBmZtZceruD+2HgYUlXR8RrAxiTmZk1mdwxCycKMzMrMsBtZmbDXG9TlF+Z3k8fuHDMzKwZ9XZmcYCkXYCPS9pG0rbVr4EK0MzMGq+3q6F+CNwC7EZ2NVT1Y08jlZuZ2TDQ45lFRFwUEXsBl0XEbhGxa9XLicLMbBgp8gzuf5K0L/DuVHRXRMwvNywzM2smRR5+dBpwFbB9el0l6dSiB5A0QtJDkm5K67tKuk/SE5KulbRJKt80rS9M29vW5wcyM7P+V+TS2U8A74iIr0XE14CJwCf7cIzTgcer1s8DLoiICcALwNRUPhV4ISJ2By5I9czMrAkUSRYCXq9af521B7t73lEaBxwBXJLWBbwfmJWqzASOSsuT0zpp+6RU38zMGqzIFOWXA/dJuiGtHwVcWrD9C4EvAKPT+nbAixFRmYiwE2hNy63A
EoCIWCNpZar/bHWDkqYB0wB23nnngmGYmdmGKDLdx/nAycDzZN1GJ0fEhXn7SToSWBER1ZMQ1jtTiALbquOZERHtEdHe0tKSF4aZmfWDImcWRMSDwIN9bPtA4EOSDgdGkU1tfiEwRtLIdHYxjmx2W8jOMsYDnZJGAluTJSgbIG3T5wDw1LlHNDgSM2s2pc0NFRFnR8S4iGgDjgNuj4iPAXcAR6dqU4Ab0/LstE7afntErHNmYWZmA68REwl+EThT0kKyMYnK+MelwHap/ExgegNiMzOzOnrthpI0Avh1RByyIQeJiDuBO9PyIuDtder8GThmQ44z2LkbyMyaVa9nFhHxOrBa0tYDFI+ZmTWhIgPcfwYWSLoNeKVSGBGnlRaVmZk1lSLJYk56mZnZMFVkIsGZkjYDdo6IPwxATGZm1mSKTCT4QWAe2bMtkLSfpNllB2ZmZs2jyKWz55BdvfQiQETMA3YtMSZrEpWrs8zMiiSLNRGxsqbMN8uZmQ0jRQa4H5H0j8AISROA04C7yw3LzMyaSZEzi1OBNwOvAtcALwGfKzMoMzNrLkWuhloNfFnSedlqrCo/LDMzayZFroZ6m6QFwHyym/MelnRA+aGZmVmzKDJmcSnwmYj4DYCkg8geiLRPmYGZmVnzKDJmsaqSKAAi4reAu6KGibbpc3wJrZn1fGYhaf+0+DtJPyIb3A7gWNIMsmZmNjz01g31rzXrX69a9n0WZmbDSI/JIiIOHshAzMyseeUOcEsaA5wItFXX9xTlZmbDR5GroW4G7gUWAH8tNxwzM2tGRZLFqIg4s68NSxoF3AVsmo4zKyK+LmlX4GfAtsCDwAkR8RdJmwJXAAcAzwHHRsRTfT2umZn1vyKXzl4p6ZOSdpS0beVVYL9XgfdHxL7AfsChkiYC5wEXRMQE4AVgaqo/FXghInYHLkj1zMysCRRJFn8BvgPcAzyQXh15O0Xm5bS6cXoF8H5gViqfCRyVlienddL2SZJUID4zMytZkW6oM4HdI+LZvjYuaQRZctkd+AHw38CLEbEmVekEWtNyK7AEICLWSFoJbAc8W9PmNGAawM4779zXkMzMbD0UObN4FFi9Po1HxOsRsR8wjuwBSnvVq5be651FrHM/R0TMiIj2iGhvaWlZn7DMzKyPipxZvA7Mk3QH2TgE0LdLZyPiRUl3AhOBMZJGprOLccDSVK0TGA90ShoJbA08X/QYZmZWniLJ4pfp1SeSWoDXUqLYDDiEbND6DuBosiuipgA3pl1mp/V70vbbI8J3ipuZNYEiz7OYmVenBzsCM9O4xUbAdRFxk6THgJ9J+hbwENmstqT3KyUtJDujOG49j2tmZv2syB3cT1J/7GC33vaLiPnAW+uULyIbv6gt/zNwTF48ZmY28Ip0Q7VXLY8i+0Ivcp+FmZkNEblXQ0XEc1WvZyLiQrJ7JczMbJgo0g21f9XqRmRnGqNLi8jMzJpOkW6o6udarAGeAj5SSjRmZtaUilwN5eda2N9UP2L1qXOPaGAkZjaQinRDbQp8mHWfZ/HN8sIyM7NmUqQb6kZgJdkcT6/m1DUzsyGoSLIYFxGHlh6JmZk1rSITCd4t6e9Kj8TMzJpWkTOLg4CT0p3cr5LNDhsRsU+pkZmZWdMokiwOKz2KYaZt+hxfSWRmg0qRS2efHohAbPCpXEabl/iK1jOz5lVkzMLMzIY5JwszM8vlZGFmZrmcLMzMLJeThW2wtulz1pozysyGntKShaTxku6Q9LikRyWdnsq3lXSbpCfS+zapXJIukrRQ0vyaqdHNzKyByjyzWAOcFRF7AROBUyTtDUwH5kbEBGBuWofsfo4J6TUNuLjE2MzMrA9KSxYRsSwiHkzLq4DHgVZgMjAzVZsJHJWWJwNXROZeYIykHcuKz8zMihuQMQtJbcBbgfuAHSJiGWQJBdg+VWsFllTt1pnKzMyswUpPFpK2BH4BfC4iXuqtap2yqNPeNEkdkjq6urr6K0wzM+tFqclC
0sZkieKqiLg+FS+vdC+l9xWpvBMYX7X7OGBpbZsRMSMi2iOivaWlpbzgzczsb8q8GkrApcDjEXF+1abZwJS0PIXs4UqV8hPTVVETgZWV7iozM2usIrPOrq8DgROABZLmpbIvAecC10maCiwGjknbbgYOBxYCq4GTS4zNzMz6oLRkERG/pf44BMCkOvUDOKWseMzMbP35Dm4zM8vlZGFmZrmcLMzMLJeThfWbvkwm6MkHzQYXJwszM8vlZGFmZrmcLMzMLJeThZmZ5XKyMDOzXE4WVgpf7WQ2tDhZmJlZLieLfuK/pM1sKHOyMDOzXGVOUW7DkM+uzIYmn1lYqZw8zIYGJwszM8vlZGFmZrmcLMzMLFdpyULSZZJWSHqkqmxbSbdJeiK9b5PKJekiSQslzZe0f1lxWXPy2IZZcyvzzOInwKE1ZdOBuRExAZib1gEOAyak1zTg4hLjMjOzPiotWUTEXcDzNcWTgZlpeSZwVFX5FZG5FxgjaceyYrPGKHrjom9wNGs+Az1msUNELANI79un8lZgSVW9zlS2DknTJHVI6ujq6io1WCuHE4HZ4NMsA9yqUxb1KkbEjIhoj4j2lpaWksMyMzMY+GSxvNK9lN5XpPJOYHxVvXHA0gGOzczMejDQyWI2MCUtTwFurCo/MV0VNRFYWemuMjOzxittbihJ1wDvA8ZK6gS+DpwLXCdpKrAYOCZVvxk4HFgIrAZOLisuG9oq4yFPnXtEgyMxG1pKSxYR8dEeNk2qUzeAU8qKxQantulz1vtL30nDrH81ywC3WV0behmtr7wy6x9OFjZkODGYlcfJwszMcvnhR9ZQRc8GPAZh1lg+szAzs1xOFjZoeQ4ps4HjbihrGkUnGQR3R5kNNCcLG5SqE4vPLszK524oG/LcXWW24ZwsbNhw0jBbf04WZmaWy8miDv8Fama2NicLG5b8B4FZ3zhZ2LBTL0k4cZj1zpfO2rBW7xJc38Nhti6fWZiZWS4nCzMzy+VkYVaj3uB3kQFxj3vYUNZUYxaSDgW+B4wALomIcxscklmfkkBtXY9/2FDRNMlC0gjgB8DfA53A/ZJmR8RjjY3MhitfNWXWrZm6od4OLIyIRRHxF+BnwOTedljwzEr/5zUzGwCKiEbHAICko4FDI+ITaf0E4B0R8dmaetOAaWn1LcAjAxpo8xoLPNvoIJqEP4tu/iy6+bPotkdEjO7LDk3TDQWoTtk6mSwiZgAzACR1RER72YENBv4suvmz6ObPops/i26SOvq6TzN1Q3UC46vWxwFLGxSLmZlVaaZkcT8wQdKukjYBjgNmNzgmMzOjibqhImKNpM8Cvya7dPayiHg0Z7cZ5Uc2aPiz6ObPops/i27+LLr1+bNomgFuMzNrXs3UDWVmZk3KycLMzHIN2mQh6VBJf5C0UNL0RsfTKJLGS7pD0uOSHpV0eqNjaiRJIyQ9JOmmRsfSaJLGSJol6ffp38c7Gx1TI0g6I/3feETSNZJGNTqmgSTpMkkrJD1SVbatpNskPZHet8lrZ1Ami6qpQQ4D9gY+KmnvxkbVMGuAsyJiL2AicMow/iwATgceb3QQTeJ7wC0RsSewL8Pwc5HUCpwGtEfEW8gunjmusVENuJ8Ah9aUTQfmRsQEYG5a79WgTBasx9QgQ1VELIuIB9PyKrIvhNbGRtUYksYBRwCXNDqWRpO0FfAe4FKAiPhLRLzY2KgaZiSwmaSRwOYMs/u3IuIu4Pma4snAzLQ8Ezgqr53BmixagSVV650M0y/IapLagLcC9zU2koa5EPgC8NdGB9IEdgO6gMtTt9wlkrZodFADLSKeAb4LLAaWASsj4tbGRtUUdoiIZZD9wQlsn7fDYE0WhaYGGU4kbQn8AvhcRLzU6HgGmqQjgRUR8UCjY2kSI4H9gYsj4q3AKxToahhqUl/8ZGBXYCdgC0nHNzaqwWmwJgtPDVJF0sZkieKqiLi+0fE0yIHAhyQ9RdYt+X5JP21sSA3VCXRGROUs
cxZZ8hhuDgGejIiuiHgNuB54V4NjagbLJe0IkN5X5O0wWJOFpwZJJImsX/rxiDi/0fE0SkScHRHjIqKN7N/D7RExbP+CjIg/Aksk7ZGKJgHD8dkwi4GJkjZP/1cmMQwH+uuYDUxJy1OAG/N2aJrpPvpiPacGGaoOBE4AFkial8q+FBE3NzAmaw6nAlelP6gWASc3OJ4BFxH3SZoFPEh25eBDDLNpPyRdA7wPGCupE/g6cC5wnaSpZAn1mNx2PN2HmZnlGazdUGZmNoCcLMzMLJeThZmZ5XKyMDOzXE4WZmaWy8nCBi1JL5fQ5n6SDq9aP0fS5zegvWPSjK931JS3SfrHAvufJOn763t8s/7iZGG2tv2Aw3NrFTcV+ExEHFxT3gbkJguzZuFkYUOCpP8j6X5J8yV9I5W1pb/qf5yeZ3CrpM3StreluvdI+k561sEmwDeBYyXNk3Rsan5vSXdKWiTptB6O/1FJC1I756WyrwEHAT+U9J2aXc4F3p2Oc4akUZIuT208JKk2uSDpiBTvWEktkn6Rfub7JR2Y6pyTnl+wVryStpA0R9LDKcZja9s361VE+OXXoHwBL6f3D5DdlSuyP4BuIpueu43srt39Ur3rgOPT8iPAu9LyucAjafkk4PtVxzgHuBvYFBgLPAdsXBPHTmR3wbaQzYpwO3BU2nYn2bMUamN/H3BT1fpZwOVpec/U3qhKPMD/Bn4DbJPqXA0clJZ3Jpvupcd4gQ8DP6463taN/v35Nbheg3K6D7MaH0ivh9L6lsAEsi/cJyOiMg3KA0CbpDHA6Ii4O5VfDRzZS/tzIuJV4FVJK4AdyCbqq3gbcGdEdAFIuoosWf2yDz/DQcC/AUTE7yU9DbwpbTsYaAc+EN0zCh9CdsZT2X8rSaN7iXcB8N101nNTRPymD7GZOVnYkCDg2xHxo7UKs+d7vFpV9DqwGfWnuO9NbRu1/2/62l49vbWxiOz5FG8COlLZRsA7I+JPazWSJY914o2I/5J0ANl4zLcl3RoR3+yHuG2Y8JiFDQW/Bj6enumBpFZJPT7MJSJeAFZJmpiKqh+zuQoYve5evboPeG8aSxgBfBT4/zn71B7nLuBjKf43kXUt/SFtexr4B+AKSW9OZbcCn63sLH8ta5MAAADVSURBVGm/3g4maSdgdUT8lOxhQMNxunLbAE4WNuhF9uSzq4F7JC0ge3ZD3hf+VGCGpHvI/qpfmcrvIOvemVd0EDiyJ42dnfZ9GHgwIvKmfJ4PrEkDzmcA/w6MSPFfC5yUupIqx/gDWTL5uaQ3kp4rnQbpHwM+nXO8vwN+l2Ym/jLwrSI/m1mFZ521YUnSlhHxclqeDuwYEac3OCyzpuUxCxuujpB0Ntn/gafJrjoysx74zMLMzHJ5zMLMzHI5WZiZWS4nCzMzy+VkYWZmuZwszMws1/8AUPwz92mI4B4AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "54\n",
      "0.9503\n"
     ]
    }
   ],
   "source": [
    "TRAIN_CSV = 'F:/dataset/0_O2O/train.csv'\n",
    "# Punctuation (ASCII and full-width) that is stripped before word segmentation.\n",
    "# A raw string keeps the pattern unchanged while avoiding invalid-escape warnings.\n",
    "PUNCT_RE = re.compile(r\"[\\s+\\.\\!\\/_,$%^*(+\\\"\\']+|[+——！，。？、~@#￥%……&*（）]+\")\n",
    "\n",
    "\n",
    "def text_to_indices(text):\n",
    "    \"\"\"Convert one raw sentence into a list of vocabulary indices.\n",
    "\n",
    "    The text is stripped of punctuation, segmented with jieba, and every word\n",
    "    is replaced by its index in cn_model's vocabulary. Words missing from the\n",
    "    vocabulary become 0.\n",
    "\n",
    "    NOTE(review): 0 is also the index of a real vocabulary entry, so unknown\n",
    "    words share that entry's id -- confirm this is intended.\n",
    "    \"\"\"\n",
    "    text = PUNCT_RE.sub(\"\", text)\n",
    "    indices = []\n",
    "    # jieba.cut returns a generator of words\n",
    "    for word in jieba.cut(text):\n",
    "        try:\n",
    "            indices.append(cn_model.vocab[word].index)\n",
    "        except KeyError:\n",
    "            indices.append(0)\n",
    "    return indices\n",
    "\n",
    "\n",
    "x_train = []\n",
    "y_train = []\n",
    "# Each line holds one \"label<TAB>text\" sample. Reading with a tab delimiter\n",
    "# avoids parsing the repr() of the row, which altered texts containing\n",
    "# backslashes and failed on blank lines. The with-block closes the file.\n",
    "with open(TRAIN_CSV, 'r', encoding='UTF-8', newline='') as train_file:\n",
    "    for row in csv.reader(train_file, delimiter='\\t', quoting=csv.QUOTE_NONE):\n",
    "        if not row:\n",
    "            continue  # skip blank lines\n",
    "        # re-join in case the text itself contains tab characters\n",
    "        x_train.append('\\t'.join(row[1:]))\n",
    "        y_train.append(int(row[0]))\n",
    "print(len(x_train))\n",
    "\n",
    "# Word segmentation and tokenization of every training sentence\n",
    "train_tokens = [text_to_indices(text) for text in x_train]\n",
    "num_tokens = np.array([len(tokens) for tokens in train_tokens])\n",
    "print('平均句长:',np.mean(num_tokens),'最长句长:',np.max(num_tokens))\n",
    "# log(0) is -inf and makes plt.hist fail, so empty sentences are left out of the plot\n",
    "plt.hist(np.log(num_tokens[num_tokens > 0]), bins = 100)\n",
    "plt.xlim((0,10))\n",
    "plt.ylabel('number of sentences')\n",
    "plt.xlabel('log(number of tokens)')\n",
    "plt.title('Distribution of tokens length')\n",
    "plt.show()\n",
    "# Sequence length budget: mean + 2 standard deviations of the sentence length\n",
    "max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))\n",
    "print(max_tokens)\n",
    "# Fraction of sentences shorter than max_tokens+3\n",
    "print(np.sum( num_tokens < max_tokens+3 ) / len(num_tokens))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameters used by the cells below.\n",
    "batchsz = 128    #mini-batch size\n",
    "num_words = 50000    #vocabulary size: only the first 50000 words are used\n",
    "max_review_len = max_tokens+3    #number of tokens kept per sentence\n",
    "embedding_dim = 300    #embedding length\n",
    "units = 64             #hidden layer size; NOTE(review): not referenced by the visible model cell, which hard-codes 32 and 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "embedding_matrix_shape: (50000, 300)\n",
      "(10000, 57)\n"
     ]
    }
   ],
   "source": [
    "# Embedding matrix, applied to the Keras Embedding layer later on.\n",
    "# Shape is [num_words, embedding_dim]; row i holds the pretrained vector of the\n",
    "# i-th vocabulary word. Allocated as float32 up front to avoid a float64 copy.\n",
    "embedding_matrix = np.zeros((num_words, embedding_dim), dtype='float32')\n",
    "# Stop at the end of the vocabulary if it has fewer than num_words entries;\n",
    "# the remaining rows stay zero instead of raising IndexError.\n",
    "for i in range(min(num_words, len(cn_model.index2word))):\n",
    "    embedding_matrix[i,:] = cn_model[cn_model.index2word[i]]\n",
    "print(\"embedding_matrix_shape:\",embedding_matrix.shape)\n",
    "\n",
    "# Padding: left-pad or left-truncate every sentence to max_review_len tokens.\n",
    "# train_tokens: variable-length index lists => train_pad: [n_samples, max_review_len]\n",
    "train_pad = keras.preprocessing.sequence.pad_sequences(train_tokens, maxlen=max_review_len,padding='pre', truncating='pre')\n",
    "train_pad[ train_pad>=num_words ] = 0 # indices beyond the embedding matrix are replaced by 0\n",
    "print(train_pad.shape)\n",
    "db_train = tf.data.Dataset.from_tensor_slices((train_pad, y_train))\n",
    "# Shuffle with a buffer covering the whole dataset; a smaller buffer only mixes\n",
    "# samples that are close together in the file.\n",
    "db_train = db_train.shuffle(buffer_size=len(train_pad)).batch(batchsz, drop_remainder=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING:tensorflow:Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING:tensorflow:Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING:tensorflow:Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING:tensorflow:Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function standard_lstm at 0x0000020ED075B510> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function standard_lstm at 0x0000020ED075B510>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING:tensorflow:Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n",
      "WARNING: Entity <function cudnn_lstm at 0x0000020ED075B598> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: converting <function cudnn_lstm at 0x0000020ED075B598>: AttributeError: module 'gast' has no attribute 'Num'\n"
     ]
    }
   ],
   "source": [
    "# Frozen pretrained embeddings -> bidirectional LSTM over the sequence\n",
    "# -> LSTM returning its last output -> single sigmoid unit.\n",
    "model = keras.Sequential([\n",
    "    layers.Embedding(\n",
    "        num_words,\n",
    "        embedding_dim,\n",
    "        weights=[embedding_matrix],\n",
    "        input_length=max_review_len,\n",
    "        trainable=False,\n",
    "    ),\n",
    "    layers.Bidirectional(layers.LSTM(units=32, return_sequences=True)),\n",
    "    layers.LSTM(units=16, return_sequences=False),\n",
    "    layers.Dense(1, activation='sigmoid'),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential_1\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, 57, 300)           15000000  \n",
      "_________________________________________________________________\n",
      "bidirectional_1 (Bidirection (None, 57, 64)            85248     \n",
      "_________________________________________________________________\n",
      "lstm_3 (LSTM)                (None, 16)                5184      \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 1)                 17        \n",
      "=================================================================\n",
      "Total params: 15,090,449\n",
      "Trainable params: 90,449\n",
      "Non-trainable params: 15,000,000\n",
      "_________________________________________________________________\n",
      "Epoch 1/5\n",
      "78/78 [==============================] - 43s 548ms/step - loss: 0.1844 - accuracy: 0.9273\n",
      "Epoch 2/5\n",
      "78/78 [==============================] - 41s 527ms/step - loss: 0.1547 - accuracy: 0.9405\n",
      "Epoch 3/5\n",
      "78/78 [==============================] - 41s 528ms/step - loss: 0.1224 - accuracy: 0.9543\n",
      "Epoch 4/5\n",
      "78/78 [==============================] - 41s 524ms/step - loss: 0.1049 - accuracy: 0.9628\n",
      "Epoch 5/5\n",
      "78/78 [==============================] - 41s 527ms/step - loss: 0.1035 - accuracy: 0.9630\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x20f07a49898>"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Adam with learning rate 0.001 and binary cross-entropy on the sigmoid output.\n",
    "model.compile(\n",
    "    optimizer=Adam(0.001),\n",
    "    loss=keras.losses.BinaryCrossentropy(),\n",
    "    metrics=['accuracy'],\n",
    ")\n",
    "model.summary()\n",
    "# Train for 5 passes over the batched training dataset.\n",
    "model.fit(db_train, epochs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "TEST_CSV = 'F:/dataset/0_O2O/test_new.csv'\n",
    "# Same punctuation filter as applied to the training texts; the raw string\n",
    "# keeps the pattern unchanged while avoiding invalid-escape warnings.\n",
    "punct_re = re.compile(r\"[\\s+\\.\\!\\/_,$%^*(+\\\"\\']+|[+——！，。？、~@#￥%……&*（）]+\")\n",
    "\n",
    "ids = []\n",
    "test_tokens = []\n",
    "# csv.reader already splits each line into fields, so the id is row[0] and the\n",
    "# text is the rest, re-joined in case it contains commas. Splitting the repr()\n",
    "# of the row on \"', '\" failed for texts containing commas or apostrophes.\n",
    "# NOTE(review): every row is treated as a sample; if the file starts with a\n",
    "# header row it receives a prediction too -- confirm against the data.\n",
    "with open(TEST_CSV, 'r', encoding='UTF-8', newline='') as test_file:\n",
    "    for row in csv.reader(test_file):\n",
    "        if not row:\n",
    "            continue  # skip blank lines\n",
    "        ID, text = row[0], ','.join(row[1:])\n",
    "        text = punct_re.sub(\"\", text)\n",
    "        tokens = []\n",
    "        for word in jieba.cut(text):\n",
    "            try:\n",
    "                tokens.append(cn_model.vocab[word].index)\n",
    "            except KeyError:\n",
    "                # words missing from the vocabulary become 0\n",
    "                tokens.append(0)\n",
    "        ids.append(ID)\n",
    "        test_tokens.append(tokens)\n",
    "\n",
    "labels = []\n",
    "if test_tokens:\n",
    "    test_pad = keras.preprocessing.sequence.pad_sequences(test_tokens, maxlen=max_review_len,padding='pre', truncating='pre')\n",
    "    test_pad[ test_pad>=num_words ] = 0 # indices beyond the embedding matrix are replaced by 0\n",
    "    # One batched predict call instead of one model.predict call per row\n",
    "    probs = model.predict(test_pad, batch_size=batchsz).reshape(-1)\n",
    "    labels = [1 if prob > 0.5 else 0 for prob in probs]\n",
    "\n",
    "# Predictions are computed before the output file is opened, and the with-block\n",
    "# closes it even if writing fails.\n",
    "with open('sample.csv','w',encoding='utf-8',newline='') as f:\n",
    "    csv_writer = csv.writer(f)\n",
    "    csv_writer.writerow([\"id\",\"label\"])# header row\n",
    "    csv_writer.writerows(zip(ids, labels))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
